if(!requireNamespace("MSTExplorer", quietly = TRUE)){
remotes::install_github("neurogenomics/MSTExplorer", dependencies = TRUE)
}
if(!requireNamespace("echodata", quietly = TRUE)){
remotes::install_github("RajLabMSSM/echodata", dependencies = TRUE)
}
knitr::opts_chunk$set(warning = FALSE,
cache = TRUE,
message = FALSE)
CSL areas of interest mapped onto Human Phenotype Ontology (HPO) and/or other disease ontologies (OMIM/ORPH).
csl <- data.table::fread(here::here("data/CSL_areas_of_interest.tsv"))
csl <- csl[Group=="Early Stage Partnering"]
# extract IDs from the Entry column of csl data.table with the pattern "HP:", "OMIM:", "ORPHA:" and put into a new col
csl[, ID := stringr::str_extract(Entry, "HP:\\d+|OMIM:\\d+|ORPHA:\\d+")]
# extract names from Entry column WITHOUT the ID
csl[, Name := trimws(stringr::str_remove(Entry, "HP:\\d+|OMIM:\\d+|ORPHA:\\d+"))]
MSTExplorer::create_dt(csl)
Extract IDs of phenotypes and diseases. Get any descendants of manually mapped HPO IDs as well.
hpo_ids <- csl[grepl("HP:",ID)]$ID |> unique()
disease_ids <- csl[!grepl("HP:",ID)]$ID |> unique()
hpo <- KGExplorer::get_ontology("hp")
hpo_ids_list <- lapply(stats::setNames(hpo_ids,hpo_ids),
function(x) simona::dag_offspring(dag = hpo,
include_self = TRUE,
term = x))
hpo_ids_extended <- unlist(hpo_ids_list) |> unique()
results <- MSTExplorer::load_example_results()
results <- HPOExplorer::add_hpo_name(results, hpo = hpo)
results <- HPOExplorer::add_ont_lvl(results)
results <- HPOExplorer::add_ancestor(results, hpo = hpo)
results <- MSTExplorer::map_celltype(results)
MSTExplorer::add_logfc(results)
results[,effect:=estimate]
p2g <- HPOExplorer::load_phenotype_to_genes()
prioritise_targets_out <- MSTExplorer::prioritise_targets(
results = results,
hpo = hpo,
phenotype_to_genes = p2g,
keep_deaths=NULL,
keep_onsets=NULL,
keep_specificity_quantiles = seq(30,40), ## NULL:70, 30-40:64
keep_mean_exp_quantiles = seq(30,40), ## NULL:65, 10:55
info_content_threshold=8, ## 8:55, 5:64
effect_threshold=NULL, ## 1:39
severity_score_gpt_threshold=NULL, ## 10:78, NULL:82
symptom_intersection_threshold=.25, ## .25:57
evidence_score_threshold=3, ## 5:47, 4:47, 3:64
top_n = 10, ## 5:38, 20:42, 30:45, 40:52, 50:55
group_vars = "hpo_id")
all_targets <- prioritise_targets_out$top_targets
targets <- all_targets[
(hpo_id %in% hpo_ids_extended
| disease_id %in% disease_ids
)
]
MSTExplorer::create_dt(head(targets,100))
Summarise results:
message(paste0(
length(intersect(all_targets$hpo_id,hpo_ids_extended)),"/",
length(unique(hpo_ids_extended)),
" (",round(length(intersect(hpo_ids_extended,all_targets$hpo_id))/
length(unique(hpo_ids_extended))*100,2),"%)",
" CSL phenotypes (across ",length(unique(all_targets$disease_id))," diseases)",
" covered in our prioritised targets."
))
## 594/1860 (31.94%) CSL phenotypes (across 4850 diseases) covered in our prioritised targets.
top_targets_ancestor <- targets[!duplicated(ancestor_name),]
csl_areas <- unique(csl[,c("ID","Area","Subarea")])
targets2 <- targets |>
merge(csl_areas,
all.x=TRUE,
by.x="hpo_id",by.y="ID") |>
merge(csl_areas,
all.x=TRUE,
by.x="disease_id",by.y="ID")
targets2[,Area:=data.table::fcoalesce(Area.x,Area.y)]
targets2[,Subarea:=data.table::fcoalesce(Subarea.x,Subarea.y)]
#### Select top N targets per CSL Area ####
num_cols <- sapply(targets2,is.numeric)
top_targets <- targets2[!is.na(Area), head(.SD,3),
by=c("Area")]
# drop list columns
# save_path <- here::here("reports/top_targets_CSL.tsv")
# top_targets[,-names(top_targets)[sapply(top_targets,is.list)],with=FALSE] |>
# data.table::fwrite(save_path, sep="\t")
# top_targets <- data.table::fread(here::here("top_targets_CSL.tsv"))
MSTExplorer::create_dt(top_targets)
target_counts <- targets2[,list(n_targets=.N,
n_genes=data.table::uniqueN(gene_symbol),
n_phenotypes=data.table::uniqueN(hpo_id),
n_diseases=data.table::uniqueN(disease_id)
),by=c("Area","Subarea")][!is.na(Area)]|>
data.table::setorderv("n_targets",-1)
MSTExplorer::create_dt(target_counts)
Check for other results with the same cell type and gene target.
stroke_targets <- targets2[grepl("stroke",hpo_name, ignore.case = TRUE)]
stroke_targets <- stroke_targets[gene_symbol=="COL4A1"]
https://gnomad.broadinstitute.org/gene/ENSG00000187498?dataset=gnomad_r4
COL4A1 exonn positions: https://www.ensembl.org/Homo_sapiens/Transcript/Exons?db=core;g=ENSG00000187498;r=13:110213499-110214499;t=ENST00000375820;v=rs34004222;vdb=variation;vf=813354076
Exon 3: chr13:110213926-110214015
#### Import ####
gnomad <- data.table::fread(here::here("data/gnomAD_v4.0.0_ENSG00000187498_2024_02_08_11_13_20.csv"),
check.names = TRUE)
#### Add exon information ####
db <- TxDb.Hsapiens.UCSC.hg38.knownGene::TxDb.Hsapiens.UCSC.hg38.knownGene
ex.gn <- GenomicFeatures::exonsBy(db, "gene")
ex <- ex.gn[["1282"]]
ex$exon_width <- GenomicRanges::width(ex)
# txs <- GenomicFeatures::transcriptsBy(db, by = "gene")
# tx.gn <- GenomicFeatures::exonsBy(TxDb.Hsapiens.UCSC.hg38.knownGene::TxDb.Hsapiens.UCSC.hg38.knownGene, by = "tx")
# tx <- tx.gn[["1282"]]
#### Convert to GRanges ####
gn <- echodata::dt_to_granges(gnomad,
chrom_col = "Chromosome",
start_col = "Position",
style="UCSC") |>
## Merge gnomad with exon data
IRanges::mergeByOverlaps(ex)
gn$ex=NULL
gn$`echodata::dt_to_granges(gnomad, chrom_col = "Chromosome", start_col = "Position", `=NULL
gn <- data.table::as.data.table(gn)
gn[,exon_variants_n:=data.table::uniqueN(gnomAD.ID),by="exon_id"]
gn[,exon_variants_perbp:=data.table::uniqueN(gnomAD.ID)/exon_width,by="exon_id"]
gn[,exon_pathovariants_perbp:=data.table::uniqueN(
gnomAD.ID[grepl("pathogenic$",ClinVar.Clinical.Significance,ignore.case = TRUE)|cadd>=20])/exon_width,by="exon_id"]
gn[,exon_pathovariants_cumfreq:=sum(
Allele.Frequency[grepl("pathogenic$",ClinVar.Clinical.Significance,ignore.case = TRUE)|cadd>=20]),by="exon_id"]
gn_top <-gn[exon_pathovariants_cumfreq %in% head(data.table::fsort(unique(exon_pathovariants_cumfreq),decreasing=TRUE),2)][
grepl("pathogenic$",ClinVar.Clinical.Significance,ignore.case = TRUE)|cadd>=20,
]
gn_top[,head(.SD,1),by="exon_id",
.SDcols = c("exon_width",
"exon_variants_n",
"exon_variants_perbp",
"exon_pathovariants_perbp",
"exon_pathovariants_cumfreq")] |>
MSTExplorer::create_dt()
sessionInfo()
## R version 4.3.1 (2023-06-16)
## Platform: aarch64-apple-darwin20 (64-bit)
## Running under: macOS Sonoma 14.4
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRlapack.dylib; LAPACK version 3.11.0
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## time zone: Europe/London
## tzcode source: internal
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## loaded via a namespace (and not attached):
## [1] ProtGenerics_1.34.0
## [2] fs_1.6.3
## [3] matrixStats_1.2.0
## [4] bitops_1.0-7
## [5] EnsDb.Hsapiens.v75_2.99.0
## [6] httr_1.4.7
## [7] RColorBrewer_1.1-3
## [8] doParallel_1.0.17
## [9] tools_4.3.1
## [10] backports_1.4.1
## [11] utf8_1.2.4
## [12] R6_2.5.1
## [13] DT_0.32
## [14] lazyeval_0.2.2
## [15] GetoptLong_1.0.5
## [16] prettyunits_1.2.0
## [17] cli_3.6.2
## [18] Biobase_2.62.0
## [19] sass_0.4.8
## [20] readr_2.1.5
## [21] ewceData_1.10.0
## [22] Rsamtools_2.18.0
## [23] yulab.utils_0.1.4
## [24] R.utils_2.12.3
## [25] dichromat_2.0-0.1
## [26] orthogene_1.9.1
## [27] maps_3.4.2
## [28] limma_3.58.1
## [29] rstudioapi_0.15.0
## [30] RSQLite_2.3.5
## [31] pals_1.9
## [32] generics_0.1.3
## [33] gridGraphics_0.5-1
## [34] shape_1.4.6.1
## [35] BiocIO_1.12.0
## [36] crosstalk_1.2.1
## [37] gtools_3.9.5
## [38] car_3.1-2
## [39] dplyr_1.1.4
## [40] zip_2.3.1
## [41] homologene_1.4.68.19.3.27
## [42] Matrix_1.6-5
## [43] fansi_1.0.6
## [44] S4Vectors_0.40.2
## [45] abind_1.4-5
## [46] R.methodsS3_1.8.2
## [47] lifecycle_1.0.4
## [48] scatterplot3d_0.3-44
## [49] yaml_2.3.8
## [50] carData_3.0-5
## [51] SummarizedExperiment_1.32.0
## [52] gplots_3.1.3.1
## [53] SparseArray_1.2.4
## [54] BiocFileCache_2.10.1
## [55] grid_4.3.1
## [56] blob_1.2.4
## [57] promises_1.2.1
## [58] ExperimentHub_2.10.0
## [59] crayon_1.5.2
## [60] lattice_0.22-5
## [61] GenomicFeatures_1.54.4
## [62] chromote_0.2.0
## [63] KEGGREST_1.42.0
## [64] mapproj_1.2.11
## [65] pillar_1.9.0
## [66] knitr_1.45
## [67] ComplexHeatmap_2.18.0
## [68] KGExplorer_0.99.0
## [69] GenomicRanges_1.54.1
## [70] rjson_0.2.21
## [71] codetools_0.2-19
## [72] glue_1.7.0
## [73] ggfun_0.1.4
## [74] data.table_1.15.2
## [75] vctrs_0.6.5
## [76] png_0.1-8
## [77] treeio_1.26.0
## [78] gtable_0.3.4
## [79] HPOExplorer_1.0.0
## [80] cachem_1.0.8
## [81] xfun_0.42
## [82] openxlsx_4.2.5.2
## [83] S4Arrays_1.2.1
## [84] mime_0.12
## [85] tidygraph_1.3.1
## [86] SingleCellExperiment_1.24.0
## [87] RNOmni_1.0.1.2
## [88] iterators_1.0.14
## [89] simona_1.0.10
## [90] statmod_1.5.0
## [91] interactiveDisplayBase_1.40.0
## [92] ellipsis_0.3.2
## [93] nlme_3.1-164
## [94] ggtree_3.10.1
## [95] EWCE_1.11.3
## [96] bit64_4.0.5
## [97] progress_1.2.3
## [98] filelock_1.0.3
## [99] rprojroot_2.0.4
## [100] GenomeInfoDb_1.38.7
## [101] bslib_0.6.1
## [102] KernSmooth_2.23-22
## [103] colorspace_2.1-0
## [104] BiocGenerics_0.48.1
## [105] DBI_1.2.2
## [106] tidyselect_1.2.1
## [107] processx_3.8.3
## [108] bit_4.0.5
## [109] compiler_4.3.1
## [110] curl_5.2.1
## [111] rvest_1.0.4
## [112] httr2_1.0.0
## [113] xml2_1.3.6
## [114] DelayedArray_0.28.0
## [115] plotly_4.10.4
## [116] rtracklayer_1.62.0
## [117] scales_1.3.0
## [118] caTools_1.18.2
## [119] rappdirs_0.3.3
## [120] stringr_1.5.1
## [121] digest_0.6.35
## [122] piggyback_0.1.5
## [123] rmarkdown_2.26
## [124] XVector_0.42.0
## [125] htmltools_0.5.7
## [126] pkgconfig_2.0.3
## [127] GeneOverlap_1.38.0
## [128] MatrixGenerics_1.14.0
## [129] echodata_0.99.17
## [130] dbplyr_2.4.0
## [131] fastmap_1.1.1
## [132] ensembldb_2.26.0
## [133] rlang_1.1.3
## [134] GlobalOptions_0.1.2
## [135] htmlwidgets_1.6.4
## [136] shiny_1.8.0
## [137] jquerylib_0.1.4
## [138] jsonlite_1.8.8
## [139] BiocParallel_1.36.0
## [140] R.oo_1.26.0
## [141] RCurl_1.98-1.14
## [142] magrittr_2.0.3
## [143] GenomeInfoDbData_1.2.11
## [144] ggplotify_0.1.2
## [145] patchwork_1.2.0
## [146] munsell_0.5.0
## [147] Rcpp_1.0.12
## [148] ape_5.7-1
## [149] babelgene_22.9
## [150] stringi_1.8.3
## [151] zlibbioc_1.48.0
## [152] AnnotationHub_3.10.0
## [153] plyr_1.8.9
## [154] parallel_4.3.1
## [155] Biostrings_2.70.2
## [156] hms_1.1.3
## [157] circlize_0.4.16
## [158] ps_1.7.6
## [159] igraph_2.0.3
## [160] ggpubr_0.6.0
## [161] ggsignif_0.6.4
## [162] reshape2_1.4.4
## [163] biomaRt_2.58.2
## [164] stats4_4.3.1
## [165] gprofiler2_0.2.3
## [166] BiocVersion_3.18.1
## [167] XML_3.99-0.16.1
## [168] evaluate_0.23
## [169] BiocManager_1.30.22
## [170] tzdb_0.4.0
## [171] foreach_1.5.2
## [172] httpuv_1.6.14
## [173] rols_2.30.2
## [174] grr_0.9.5
## [175] tidyr_1.3.1
## [176] purrr_1.0.2
## [177] clue_0.3-65
## [178] ggplot2_3.5.0
## [179] broom_1.0.5
## [180] xtable_1.8-4
## [181] restfulr_0.0.15
## [182] AnnotationFilter_1.26.0
## [183] tidytree_0.4.6
## [184] rstatix_0.7.2
## [185] later_1.3.2
## [186] viridisLite_0.4.2
## [187] MSTExplorer_1.0.0
## [188] TxDb.Hsapiens.UCSC.hg38.knownGene_3.18.0
## [189] Polychrome_1.5.1
## [190] tibble_3.2.1
## [191] websocket_1.4.1
## [192] aplot_0.2.2
## [193] memoise_2.0.1.9000
## [194] AnnotationDbi_1.64.1
## [195] GenomicAlignments_1.38.2
## [196] IRanges_2.36.0
## [197] cluster_2.1.6
## [198] HGNChelper_0.8.1
## [199] here_1.0.1